import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display
# Import the NYC Airbnb 2019 listings dataset (one row per listing).
Airbnb = pd.read_csv("../Project/archive/AB_NYC_2019.csv")
# Dimensions (rows, columns) — expect (48895, 16).
Airbnb.shape
(48895, 16)
# Preview the first five rows.
Airbnb.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
# Missing values per column — name, host_name, last_review and reviews_per_month have NaNs.
Airbnb.isnull().sum()
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
# Column dtypes and non-null counts.
Airbnb.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48895 entries, 0 to 48894 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48895 non-null int64 1 name 48879 non-null object 2 host_id 48895 non-null int64 3 host_name 48874 non-null object 4 neighbourhood_group 48895 non-null object 5 neighbourhood 48895 non-null object 6 latitude 48895 non-null float64 7 longitude 48895 non-null float64 8 room_type 48895 non-null object 9 price 48895 non-null int64 10 minimum_nights 48895 non-null int64 11 number_of_reviews 48895 non-null int64 12 last_review 38843 non-null object 13 reviews_per_month 38843 non-null float64 14 calculated_host_listings_count 48895 non-null int64 15 availability_365 48895 non-null int64 dtypes: float64(3), int64(7), object(6) memory usage: 6.0+ MB
# Summary statistics for both the numeric and the categorical columns.
Airbnb[["price","neighbourhood_group","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]].describe(include=["object","datetime64","float","int","int64"])
# Noticed that there are 0 values in the price column (min price == 0) — investigated below.
| price | neighbourhood_group | neighbourhood | room_type | minimum_nights | number_of_reviews | reviews_per_month | last_review | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 48895.000000 | 48895 | 48895 | 48895 | 48895.000000 | 48895.000000 | 38843.000000 | 38843 | 48895.000000 | 48895.000000 |
| unique | NaN | 5 | 221 | 3 | NaN | NaN | NaN | 1764 | NaN | NaN |
| top | NaN | Manhattan | Williamsburg | Entire home/apt | NaN | NaN | NaN | 2019-06-23 | NaN | NaN |
| freq | NaN | 21661 | 3920 | 25409 | NaN | NaN | NaN | 1413 | NaN | NaN |
| mean | 152.720687 | NaN | NaN | NaN | 7.029962 | 23.274466 | 1.373221 | NaN | 7.143982 | 112.781327 |
| std | 240.154170 | NaN | NaN | NaN | 20.510550 | 44.550582 | 1.680442 | NaN | 32.952519 | 131.622289 |
| min | 0.000000 | NaN | NaN | NaN | 1.000000 | 0.000000 | 0.010000 | NaN | 1.000000 | 0.000000 |
| 25% | 69.000000 | NaN | NaN | NaN | 1.000000 | 1.000000 | 0.190000 | NaN | 1.000000 | 0.000000 |
| 50% | 106.000000 | NaN | NaN | NaN | 3.000000 | 5.000000 | 0.720000 | NaN | 1.000000 | 45.000000 |
| 75% | 175.000000 | NaN | NaN | NaN | 5.000000 | 24.000000 | 2.020000 | NaN | 2.000000 | 227.000000 |
| max | 10000.000000 | NaN | NaN | NaN | 1250.000000 | 629.000000 | 58.500000 | NaN | 327.000000 | 365.000000 |
# Unique-value counts per column for the listings with a non-zero price
# (compare against the full 48895 rows to see how many zero-price rows exist).
Airbnb[Airbnb["price"] != 0].nunique()
id 48884 name 47894 host_id 37455 host_name 11450 neighbourhood_group 5 neighbourhood 221 latitude 19046 longitude 14715 room_type 3 price 673 minimum_nights 109 number_of_reviews 394 last_review 1764 reviews_per_month 937 calculated_host_listings_count 47 availability_365 366 dtype: int64
# Drop the free-text name and host_name columns — not needed for the analysis.
Airbnb_clean = Airbnb.drop(["name","host_name"],axis=1).copy()
# Fill the missing data in the reviews_per_month column with 0 (no reviews yet).
Airbnb_clean.fillna({'reviews_per_month':0}, inplace=True)
# Change the data type of last_review to datetime (missing dates become NaT).
Airbnb_clean["last_review"] = pd.to_datetime(Airbnb_clean["last_review"])
Airbnb_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48895 entries, 0 to 48894 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48895 non-null int64 1 host_id 48895 non-null int64 2 neighbourhood_group 48895 non-null object 3 neighbourhood 48895 non-null object 4 latitude 48895 non-null float64 5 longitude 48895 non-null float64 6 room_type 48895 non-null object 7 price 48895 non-null int64 8 minimum_nights 48895 non-null int64 9 number_of_reviews 48895 non-null int64 10 last_review 38843 non-null datetime64[ns] 11 reviews_per_month 48895 non-null float64 12 calculated_host_listings_count 48895 non-null int64 13 availability_365 48895 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(7), object(3) memory usage: 5.2+ MB
# Preview the cleaned data.
Airbnb_clean.head()
| id | host_id | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | 2787 | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | 2845 | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | 4632 | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaT | 0.00 | 1 | 365 |
| 3 | 3831 | 4869 | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | 7192 | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
# Re-run the summary statistics on the cleaned data (last_review now aggregates as a datetime).
Airbnb_clean[["price","neighbourhood_group","latitude","longitude","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]].describe(include=["object","datetime64","float","int","int64"])
| price | neighbourhood_group | latitude | longitude | neighbourhood | room_type | minimum_nights | number_of_reviews | reviews_per_month | last_review | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 48895.000000 | 48895 | 48895.000000 | 48895.000000 | 48895 | 48895 | 48895.000000 | 48895.000000 | 48895.000000 | 38843 | 48895.000000 | 48895.000000 |
| unique | NaN | 5 | NaN | NaN | 221 | 3 | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | Manhattan | NaN | NaN | Williamsburg | Entire home/apt | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | 21661 | NaN | NaN | 3920 | 25409 | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 152.720687 | NaN | 40.728949 | -73.952170 | NaN | NaN | 7.029962 | 23.274466 | 1.090910 | 2018-10-04 01:47:23.910099456 | 7.143982 | 112.781327 |
| min | 0.000000 | NaN | 40.499790 | -74.244420 | NaN | NaN | 1.000000 | 0.000000 | 0.000000 | 2011-03-28 00:00:00 | 1.000000 | 0.000000 |
| 25% | 69.000000 | NaN | 40.690100 | -73.983070 | NaN | NaN | 1.000000 | 1.000000 | 0.040000 | 2018-07-08 00:00:00 | 1.000000 | 0.000000 |
| 50% | 106.000000 | NaN | 40.723070 | -73.955680 | NaN | NaN | 3.000000 | 5.000000 | 0.370000 | 2019-05-19 00:00:00 | 1.000000 | 45.000000 |
| 75% | 175.000000 | NaN | 40.763115 | -73.936275 | NaN | NaN | 5.000000 | 24.000000 | 1.580000 | 2019-06-23 00:00:00 | 2.000000 | 227.000000 |
| max | 10000.000000 | NaN | 40.913060 | -73.712990 | NaN | NaN | 1250.000000 | 629.000000 | 58.500000 | 2019-07-08 00:00:00 | 327.000000 | 365.000000 |
| std | 240.154170 | NaN | 0.054530 | 0.046157 | NaN | NaN | 20.510550 | 44.550582 | 1.597283 | NaN | 32.952519 | 131.622289 |
# Inspect the listings with a price of 0 (likely data-entry errors).
# Fix: build the boolean mask from Airbnb_clean itself instead of the original
# Airbnb frame — the indexes happen to match here, but the mixed-frame mask
# breaks silently if either frame is ever filtered or re-indexed.
Airbnb_clean[Airbnb_clean["price"] == 0]
| id | host_id | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23161 | 18750597 | 8993084 | Brooklyn | Bedford-Stuyvesant | 40.69023 | -73.95428 | Private room | 0 | 4 | 1 | 2018-01-06 | 0.05 | 4 | 28 |
| 25433 | 20333471 | 131697576 | Bronx | East Morrisania | 40.83296 | -73.88668 | Private room | 0 | 2 | 55 | 2019-06-24 | 2.56 | 4 | 127 |
| 25634 | 20523843 | 15787004 | Brooklyn | Bushwick | 40.69467 | -73.92433 | Private room | 0 | 2 | 16 | 2019-05-18 | 0.71 | 5 | 0 |
| 25753 | 20608117 | 1641537 | Brooklyn | Greenpoint | 40.72462 | -73.94072 | Private room | 0 | 2 | 12 | 2017-10-27 | 0.53 | 2 | 0 |
| 25778 | 20624541 | 10132166 | Brooklyn | Williamsburg | 40.70838 | -73.94645 | Entire home/apt | 0 | 5 | 3 | 2018-01-02 | 0.15 | 1 | 73 |
| 25794 | 20639628 | 86327101 | Brooklyn | Bedford-Stuyvesant | 40.68173 | -73.91342 | Private room | 0 | 1 | 93 | 2019-06-15 | 4.28 | 6 | 176 |
| 25795 | 20639792 | 86327101 | Brooklyn | Bedford-Stuyvesant | 40.68279 | -73.91170 | Private room | 0 | 1 | 95 | 2019-06-21 | 4.37 | 6 | 232 |
| 25796 | 20639914 | 86327101 | Brooklyn | Bedford-Stuyvesant | 40.68258 | -73.91284 | Private room | 0 | 1 | 95 | 2019-06-23 | 4.35 | 6 | 222 |
| 26259 | 20933849 | 13709292 | Manhattan | Murray Hill | 40.75091 | -73.97597 | Entire home/apt | 0 | 3 | 0 | NaT | 0.00 | 1 | 0 |
| 26841 | 21291569 | 101970559 | Brooklyn | Bushwick | 40.69211 | -73.90670 | Shared room | 0 | 30 | 2 | 2019-06-22 | 0.11 | 6 | 333 |
| 26866 | 21304320 | 101970559 | Brooklyn | Bushwick | 40.69166 | -73.90928 | Shared room | 0 | 30 | 5 | 2019-05-24 | 0.26 | 6 | 139 |
# Count how many listings are priced at 0 (value_counts over the zero-price subset).
zero_price_mask = Airbnb_clean["price"] == 0
Airbnb_clean.loc[zero_price_mask, "price"].value_counts()
price 0 11 Name: count, dtype: int64
# Pairwise scatter plots of the selected columns to see the correlations.
# NOTE(review): pairplot only draws the numeric columns; the categorical and
# datetime entries in `columns` are effectively ignored — confirm intended.
sns.set()
columns=["price","neighbourhood_group","latitude","longitude","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]
facet_scatter = sns.pairplot(Airbnb_clean[columns], height=2, kind='scatter', diag_kind='kde')
plt.xticks(rotation=45, horizontalalignment='right')
plt.show()
# Plot the correlation between the numeric columns as a colour-graded table.
# NOTE(review): corr_cols includes the datetime column last_review; .corr() on a
# datetime column relies on older pandas behaviour — confirm on the target version.
corr_cols = ["price","latitude","longitude","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]
correlation = Airbnb_clean[corr_cols].corr()
correlation.style.background_gradient(axis=None)
| price | latitude | longitude | minimum_nights | number_of_reviews | reviews_per_month | last_review | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|
| price | 1.000000 | 0.033939 | -0.150019 | 0.042799 | -0.047954 | -0.050564 | -0.085239 | 0.057472 | 0.081829 |
| latitude | 0.033939 | 1.000000 | 0.084788 | 0.024869 | -0.015389 | -0.018758 | -0.029530 | 0.019517 | -0.010983 |
| longitude | -0.150019 | 0.084788 | 1.000000 | -0.062747 | 0.059094 | 0.138516 | 0.044203 | -0.114713 | 0.082731 |
| minimum_nights | 0.042799 | 0.024869 | -0.062747 | 1.000000 | -0.080116 | -0.124905 | -0.111649 | 0.127960 | 0.144303 |
| number_of_reviews | -0.047954 | -0.015389 | 0.059094 | -0.080116 | 1.000000 | 0.589407 | 0.267759 | -0.072376 | 0.172028 |
| reviews_per_month | -0.050564 | -0.018758 | 0.138516 | -0.124905 | 0.589407 | 1.000000 | 0.350466 | -0.047312 | 0.163732 |
| last_review | -0.085239 | -0.029530 | 0.044203 | -0.111649 | 0.267759 | 0.350466 | 1.000000 | -0.117821 | 0.033440 |
| calculated_host_listings_count | 0.057472 | 0.019517 | -0.114713 | 0.127960 | -0.072376 | -0.047312 | -0.117821 | 1.000000 | 0.225701 |
| availability_365 | 0.081829 | -0.010983 | 0.082731 | 0.144303 | 0.172028 | 0.163732 | 0.033440 | 0.225701 | 1.000000 |
# Top 10 hosts by number of listings (kept for plotting; second line displays it).
top_host=Airbnb_clean.host_id.value_counts().head(10)
Airbnb_clean.host_id.value_counts().head(10)
host_id 219517861 327 107434423 232 30283594 121 137358866 103 16098958 96 12243051 96 61391963 91 22541573 87 200380610 65 7503643 52 Name: count, dtype: int64
# Bar chart of the top 10 hosts by listing count.
top_host.plot(
    kind="bar",
    title="top listing ",
    ylabel="count",
    xlabel="host ID"
);
# Count how many room types are available and how often each occurs.
Airbnb_clean["room_type"].unique()
Airbnb_clean["room_type"].value_counts()
room_type Entire home/apt 25409 Private room 22326 Shared room 1160 Name: count, dtype: int64
# Bar chart of the room-type frequencies.
Airbnb_clean["room_type"].value_counts().plot(
    kind="bar",
    title="type of room count",
    ylabel="count",
    xlabel="room type"
);
# Box plot of price vs. room type to see the outliers.
px.box(Airbnb_clean,x="room_type",y="price",title="price vs type of room");
# Price percentiles — the 5th (40) and 95th (355) are used below as outlier cut-offs.
# Fix: read from Airbnb_clean for consistency with the rest of the analysis
# (identical values: the cleaning steps did not touch the price column).
Airbnb_clean['price'].quantile([0.05, 0.25, 0.5, 0.75, 0.95])
0.05 40.0 0.25 69.0 0.50 106.0 0.75 175.0 0.95 355.0 Name: price, dtype: float64
# Price distribution per room type, plus a box plot, before outlier removal.
Airbnb_clean_hist=px.histogram(Airbnb_clean,x="price",facet_col="room_type",)
Airbnb_clean_hist.show()
px.box(Airbnb_clean,x="room_type",y="price")
# Remove outliers by keeping prices strictly between the 5th (40) and 95th (355)
# percentiles computed above; per Bruce, Bruce and Gedeck (2020), values beyond
# the box-plot whiskers are treated as outliers.
Airbnb_clean_outliers = Airbnb_clean.query("price < 355 and price > 40").copy()
Airbnb_clean_outliers_hist=px.histogram(Airbnb_clean_outliers,x="price",facet_col="room_type",nbins=30)
Airbnb_clean_outliers_hist.show()
px.box(Airbnb_clean_outliers,x="room_type",y="price")
Airbnb_clean_outliers_hist= px.histogram(Airbnb_clean_outliers,x="price",nbins=50)
Airbnb_clean_outliers_hist.show()
# Get the number of listings in each neighbourhood_group (borough).
neighbourhood_group_bar=Airbnb_clean["neighbourhood_group"].value_counts()
Airbnb_clean["neighbourhood_group"].value_counts()
neighbourhood_group_bar.plot(
    kind="bar",
    title="amount of listing in each neighbourhood group",
    ylabel="count",
    xlabel="neighbourhood_group"
);
# Get the number of listings in each of the top 30 neighbourhoods.
neighbourhood_vis=Airbnb_clean["neighbourhood"].value_counts().head(30)
neighbourhood_vis_df =pd.DataFrame(neighbourhood_vis)
# reset_index() yields the columns 'neighbourhood' and 'count'.
neighbourhood_vis_df.reset_index(inplace=True)
neighbourhood_vis_df
| neighbourhood | count | |
|---|---|---|
| 0 | Williamsburg | 3920 |
| 1 | Bedford-Stuyvesant | 3714 |
| 2 | Harlem | 2658 |
| 3 | Bushwick | 2465 |
| 4 | Upper West Side | 1971 |
| 5 | Hell's Kitchen | 1958 |
| 6 | East Village | 1853 |
| 7 | Upper East Side | 1798 |
| 8 | Crown Heights | 1564 |
| 9 | Midtown | 1545 |
| 10 | East Harlem | 1117 |
| 11 | Greenpoint | 1115 |
| 12 | Chelsea | 1113 |
| 13 | Lower East Side | 911 |
| 14 | Astoria | 900 |
| 15 | Washington Heights | 899 |
| 16 | West Village | 768 |
| 17 | Financial District | 744 |
| 18 | Flatbush | 621 |
| 19 | Clinton Hill | 572 |
| 20 | Long Island City | 537 |
| 21 | Prospect-Lefferts Gardens | 535 |
| 22 | Park Slope | 506 |
| 23 | East Flatbush | 500 |
| 24 | Fort Greene | 489 |
| 25 | Murray Hill | 485 |
| 26 | Kips Bay | 470 |
| 27 | Flushing | 426 |
| 28 | Ridgewood | 423 |
| 29 | Greenwich Village | 392 |
# Bar chart of the 30 most-listed neighbourhoods.
plt.figure(figsize=(10, 6))  # adjust the numbers to change the figure size
# Fix: the value_counts/reset_index frame exposes a lowercase 'count' column,
# so y must be "count" — y="Count" raises at runtime. Also bind the Axes to a
# separate variable instead of shadowing the neighbourhood_vis Series.
ax = sns.barplot(x="neighbourhood", y="count", data=neighbourhood_vis_df, palette='Blues_d')
ax.set_title('count of listing in each neighbourhood')
ax.set_ylabel('Count of listings')
ax.set_xlabel('neighbourhood')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
# Create DataFrames with the number of listings per neighbourhood, grouped by borough
# (one for the full cleaned data, one for the outlier-trimmed data).
nei_group = Airbnb_clean.groupby("neighbourhood_group")["neighbourhood"].value_counts().reset_index(name="count").copy()
nei_group_outliers = Airbnb_clean_outliers.groupby("neighbourhood_group")["neighbourhood"].value_counts().reset_index(name="count").copy()
nei_group
| neighbourhood_group | neighbourhood | count | |
|---|---|---|---|
| 0 | Bronx | Kingsbridge | 70 |
| 1 | Bronx | Fordham | 63 |
| 2 | Bronx | Longwood | 62 |
| 3 | Bronx | Mott Haven | 60 |
| 4 | Bronx | Wakefield | 50 |
| ... | ... | ... | ... |
| 216 | Staten Island | Richmondtown | 1 |
| 217 | Staten Island | New Dorp | 1 |
| 218 | Staten Island | Fort Wadsworth | 1 |
| 219 | Staten Island | Willowbrook | 1 |
| 220 | Staten Island | Woodrow | 1 |
221 rows × 3 columns
# Stacked bar charts of listings per neighbourhood, coloured by borough —
# before and after outlier removal.
Airbnb_clean_listing = px.bar(nei_group, x="neighbourhood",y="count",color="neighbourhood_group",labels={'count': 'count of listings'})
Airbnb_clean_listing.show()
Airbnb_clean_outliers_listing= px.bar(nei_group_outliers, x="neighbourhood",y="count",color="neighbourhood_group",labels={'count': 'count of listings'})
Airbnb_clean_outliers_listing.show()
# Scatter plot to see which neighbourhood has the most expensive listings.
Airbnb_clean_facet = px.scatter(Airbnb_clean, x = "neighbourhood", y = "price", color="neighbourhood_group" , facet_col="room_type" )
Airbnb_clean_facet.show()
# Same scatter plot after removing the price outliers.
Airbnb_clean_outliers_facet = px.scatter(Airbnb_clean_outliers, x = "neighbourhood", y = "price", color="neighbourhood_group" )
Airbnb_clean_outliers_facet.show()
# Map the listings geographically, coloured by price (outlier-trimmed data).
scatter_mapbox_clean = px.scatter_mapbox(
    Airbnb_clean_outliers,
    lat="latitude",
    lon="longitude",
    color="price",
    mapbox_style="open-street-map",
    hover_data="price",
    hover_name="room_type",
    height=800,
    title="distribution of listings as per the price",
)
scatter_mapbox_clean.show()
# Same map, coloured by borough instead of price (variable is intentionally reused).
scatter_mapbox_clean = px.scatter_mapbox(
    Airbnb_clean_outliers,
    lat="latitude",
    lon="longitude",
    color="neighbourhood_group",
    mapbox_style="open-street-map",
    hover_data="price",
    hover_name="room_type",
    height=800,
    title="distribution of listings for each neighbourhood_group",
)
scatter_mapbox_clean.show()
# --- Clustering section (originally run on Colab, hence the /content/ path) ---
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
# Load the data (note: this reloads the raw CSV, discarding the cleaning above).
Airbnb = pd.read_csv("/content/AB_NYC_2019.csv")
# Extract the relevant features for clustering.
features = ["room_type", "latitude", "longitude", "price", "number_of_reviews", "calculated_host_listings_count"]
X_airbnb = Airbnb[features].values
# Column indices within `features`: 1-5 numeric, 0 (room_type) categorical.
numerical_cols = [1, 2, 3, 4, 5]
categorical_cols = [0]
# Scale the numeric features and one-hot encode room_type.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)
# Apply the preprocessor and drop any rows with missing values.
# None of these six source columns contain NaNs (all 48895 non-null per .info()),
# so the filter is a no-op safeguard here.
# NOTE(review): np.isnan assumes fit_transform returned a dense array — confirm
# the ColumnTransformer's sparse_threshold keeps the output dense.
X_airbnb_encoded = preprocessor.fit_transform(pd.DataFrame(X_airbnb))
X_airbnb_encoded = X_airbnb_encoded[~np.isnan(X_airbnb_encoded).any(axis=1)]
# Perform K-means clustering with the optimal K (K=7 chosen from the
# silhouette / elbow analysis further below).
optimal_k = 7
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_airbnb_encoded)
# Build a DataFrame of the raw (pre-encoding) features plus the cluster label.
feature_names = ["Room Type", "Latitude", "Longitude", "Price", "Number of Reviews", "Host Listings Count"]
data_with_clusters = pd.DataFrame(X_airbnb, columns=feature_names)
data_with_clusters['Cluster'] = clusters
# Add the 'neighbourhood_group' column to data_with_clusters.
data_with_clusters['neighbourhood_group'] = Airbnb['neighbourhood_group']
# Keep the raw room_type strings for the descriptive statistics
# (redundant re-assignment: the column already holds X_airbnb[:, 0]).
data_with_clusters["Room Type"] = X_airbnb[:, 0]
data_with_clusters.head()
# Convert the numeric columns FIRST so the per-cluster aggregation runs on real
# numbers instead of object dtype — the original aggregated before converting,
# which is what produced the pandas "did not aggregate successfully"
# FutureWarning on the string columns.
numerical_columns = ['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']
data_with_clusters[numerical_columns] = data_with_clusters[numerical_columns].apply(pd.to_numeric, errors='coerce')
# Per-cluster min/max/mean/std of the numeric features only.
cluster_stats = data_with_clusters.groupby('Cluster')[numerical_columns].agg(['min', 'max', 'mean', 'std']).transpose()
# Percentage share of each room type / borough within each cluster.
room_type_percentage = data_with_clusters.groupby(['Cluster', 'Room Type']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
neighbourhood_percentage = data_with_clusters.groupby(['Cluster', 'neighbourhood_group']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
pd.set_option('display.max_columns', None)
print(cluster_stats)
print(room_type_percentage.T)
print(neighbourhood_percentage.T)
# Five stacked violin plots — one per numeric feature, split by cluster.
# Fix: the loop-body indentation was lost in this dump; restored so the
# two plotting statements actually run once per feature.
fig, axes = plt.subplots(5, 1, figsize=(5, 20), sharey=False)
for i, value_col in enumerate(['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']):
    sns.violinplot(x='Cluster', y=value_col, data=data_with_clusters, ax=axes[i])
    axes[i].set_title(f'Violin Plot for {value_col}')
# Show the plots
plt.show()
# Visualize clusters using a pairplot of the numeric features (skips "Room Type"),
# coloured by cluster label.
sns.pairplot(data_with_clusters, hue='Cluster', palette='viridis', vars=feature_names[1:])
plt.show()
# Project the encoded features to 2D with PCA and colour by cluster.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X_airbnb_encoded)
data_with_clusters_pca = pd.DataFrame(np.column_stack([pca_result, clusters]), columns=['PCA Component 1', 'PCA Component 2', 'Cluster'])
sns.scatterplot(data=data_with_clusters_pca, x='PCA Component 1', y='PCA Component 2', hue='Cluster', palette='viridis', s=50, alpha=0.8)
plt.title('PCA Visualization of KMeans Clusters')
plt.show()
# K-means clusters plotted on latitude/longitude from the original dataset.
# NOTE(review): this assumes `clusters` has one label per row of Airbnb — true
# here only because the NaN filter above dropped nothing; confirm if that changes.
# (Axes are latitude on x / longitude on y, the reverse of map convention.)
plt.scatter(Airbnb['latitude'], Airbnb['longitude'], c=clusters, cmap='viridis', s=50, alpha=0.8)
plt.title('K-means Clusters Based on Latitude and Longitude')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning <ipython-input-43-ab1566537851>:55: FutureWarning: ['Room Type', 'neighbourhood_group'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
Cluster 0 1 2 3 \
Latitude min 40.506410 40.583630 40.735500 40.538710
max 40.822630 40.825110 40.913060 40.898110
mean 40.713820 40.733868 40.801172 40.723546
std 0.037672 0.045229 0.032190 0.050113
Longitude min -74.244420 -74.097300 -73.999940 -74.169660
max -73.837460 -73.770690 -73.829320 -73.746270
mean -73.973330 -73.977600 -73.942120 -73.953578
std 0.028495 0.039361 0.026886 0.039106
Price min 0.000000 2545.000000 0.000000 0.000000
max 2500.000000 10000.000000 2500.000000 1050.000000
mean 224.457766 4820.015152 115.564625 128.122311
std 198.782479 2237.169651 83.734338 83.091550
Number of Reviews min 0.000000 0.000000 0.000000 81.000000
max 109.000000 69.000000 143.000000 629.000000
mean 12.727838 2.651515 15.054296 157.637984
std 19.116081 8.882987 22.210175 66.333955
Host Listings Count min 1.000000 1.000000 1.000000 1.000000
max 121.000000 12.000000 103.000000 28.000000
mean 5.382564 2.772727 2.976965 1.980332
std 17.264435 3.724329 8.968432 1.896065
Cluster 4 5 6
Latitude min 40.703720 40.499790 40.565460
max 40.790940 40.763220 40.866460
mean 40.729629 40.695390 40.710963
std 0.023925 0.034129 0.050271
Longitude min -74.017120 -74.242850 -73.900020
max -73.949100 -73.884260 -73.712990
mean -73.996995 -73.956629 -73.826358
std 0.014972 0.034042 0.044847
Price min 100.000000 0.000000 10.000000
max 699.000000 800.000000 1500.000000
mean 273.928444 79.142267 95.403955
std 100.975965 47.371032 90.731393
Number of Reviews min 0.000000 0.000000 0.000000
max 20.000000 100.000000 212.000000
mean 2.343470 12.143374 24.406780
std 3.775523 18.743582 30.796190
Host Listings Count min 232.000000 1.000000 1.000000
max 327.000000 103.000000 103.000000
mean 287.572451 3.256109 3.526483
std 46.850957 8.457560 8.337037
0 1 2 3 4 \
Room Type
Entire home/apt 98.463190 81.818182 41.261426 53.657037 98.568873
Private room 0.813605 18.181818 54.890311 44.714198 1.431127
Shared room 0.723205 0.000000 3.848263 1.628765 0.000000
5 6
Room Type
Entire home/apt 0.465116 37.217514
Private room 96.293835 58.580508
Shared room 3.241048 4.201977
0 1 2 3 4 \
neighbourhood_group
Bronx 0.000000 0.000000 8.930530 1.167793 0.000000
Brooklyn 47.771061 22.727273 0.000000 47.910264 0.357782
Manhattan 49.104469 72.727273 75.740402 39.551321 99.642218
Queens 2.180914 3.030303 15.329068 10.540873 0.000000
Staten Island 0.943556 1.515152 0.000000 0.829748 0.000000
5 6
neighbourhood_group
Bronx 0.000000 2.683616
Brooklyn 72.595054 8.474576
Manhattan 20.612772 0.000000
Queens 5.478036 88.841808
Staten Island 1.314138 0.000000
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

# Silhouette analysis over a range of K values (adapted from the scikit-learn
# silhouette example). Fix: the loop/branch indentation was lost in this dump;
# the canonical structure is restored below.
# NOTE(review): silhouette_samples over ~49k points is expensive — confirm runtime
# is acceptable.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_airbnb_encoded) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=10)
    cluster_labels = clusterer.fit_predict(X_airbnb_encoded)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    silhouette_avg = silhouette_score(X_airbnb_encoded, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_airbnb_encoded, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X_airbnb_encoded[:, 0], X_airbnb_encoded[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

plt.show()
For n_clusters = 2 The average silhouette_score is : 0.1909706201676494 For n_clusters = 3 The average silhouette_score is : 0.19924914334117888 For n_clusters = 4 The average silhouette_score is : 0.21436777840705593 For n_clusters = 5 The average silhouette_score is : 0.2164208271656542 For n_clusters = 6 The average silhouette_score is : 0.24978012466829047 For n_clusters = 7 The average silhouette_score is : 0.28402181291564366 For n_clusters = 8 The average silhouette_score is : 0.28298092537662833
# Determine the range of K values to explore.
k_values = range(1, 11)
# Calculate inertias for different K values (refits KMeans once per K).
inertias = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_airbnb_encoded).inertia_ for k in k_values]
# Plot the elbow curve.
plt.plot(k_values, inertias, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()
Repeat the clustering after removing rooms with a price of 0
# --- Second clustering run: identical pipeline, zero-price rows removed ---
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
# Reload the raw data.
Airbnb = pd.read_csv("/content/AB_NYC_2019.csv")
# Remove the rows with a 0 price, which do not make sense.
Airbnb = Airbnb[Airbnb['price'] != 0]
# Extract the relevant features for clustering.
features = ["room_type", "latitude", "longitude", "price", "number_of_reviews", "calculated_host_listings_count"]
X_airbnb = Airbnb[features].values
# Column indices within `features`: 1-5 numeric, 0 (room_type) categorical.
numerical_cols = [1, 2, 3, 4, 5]
categorical_cols = [0]
# Scale the numeric features and one-hot encode room_type.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)
# Apply the preprocessor and drop any rows with missing values (a no-op safeguard:
# none of these six source columns contain NaNs).
X_airbnb_encoded = preprocessor.fit_transform(pd.DataFrame(X_airbnb))
X_airbnb_encoded = X_airbnb_encoded[~np.isnan(X_airbnb_encoded).any(axis=1)]
# Perform K-means clustering with the optimal K (K=7, as in the first run).
optimal_k = 7
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_airbnb_encoded)
# Build a DataFrame of the raw (pre-encoding) features plus the cluster label.
feature_names = ["Room Type", "Latitude", "Longitude", "Price", "Number of Reviews", "Host Listings Count"]
data_with_clusters = pd.DataFrame(X_airbnb, columns=feature_names)
data_with_clusters['Cluster'] = clusters
# Add the 'neighbourhood_group' column to data_with_clusters.
# NOTE(review): Airbnb was filtered above, so its index is no longer 0..n-1 while
# data_with_clusters has a fresh RangeIndex — this index-aligned assignment may
# introduce NaNs/misalignment; confirm (a .values / reset_index would be safer).
data_with_clusters['neighbourhood_group'] = Airbnb['neighbourhood_group']
# Keep the raw room_type strings for the descriptive statistics
# (redundant re-assignment: the column already holds X_airbnb[:, 0]).
data_with_clusters["Room Type"] = X_airbnb[:, 0]
data_with_clusters.head()
#cluster_stats_numeric = data_with_clusters.groupby('Cluster').apply(lambda x: x.describe()).transpose()
cluster_stats = data_with_clusters.groupby('Cluster').agg(['min', 'max', 'mean', 'std']).transpose()
numerical_columns = ['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']
data_with_clusters[numerical_columns] = data_with_clusters[numerical_columns].apply(pd.to_numeric, errors='coerce')
# Percentage of each neighbourhood_group for each cluster
room_type_percentage = data_with_clusters.groupby(['Cluster', 'Room Type']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
neighbourhood_percentage = data_with_clusters.groupby(['Cluster', 'neighbourhood_group']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
pd.set_option('display.max_columns', None)
print(cluster_stats)
print(room_type_percentage.T)
print(neighbourhood_percentage.T)
# Five stacked violin plots sharing the cluster axis, one per numeric feature
fig, axes = plt.subplots(5, 1, figsize=(5, 20), sharey=False)
violin_columns = ['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']
for ax, column in zip(axes, violin_columns):
    sns.violinplot(x='Cluster', y=column, data=data_with_clusters, ax=ax)
    ax.set_title(f'Violin Plot for {column}')
plt.show()
# Pairwise scatter matrix of the numeric features, coloured by cluster
sns.pairplot(data_with_clusters, hue='Cluster', palette='viridis', vars=feature_names[1:])
plt.show()
# Project the encoded feature matrix onto its first two principal components
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X_airbnb_encoded)
pca_with_labels = np.column_stack([pca_result, clusters])
data_with_clusters_pca = pd.DataFrame(pca_with_labels, columns=['PCA Component 1', 'PCA Component 2', 'Cluster'])
sns.scatterplot(data=data_with_clusters_pca, x='PCA Component 1', y='PCA Component 2', hue='Cluster', palette='viridis', s=50, alpha=0.8)
plt.title('PCA Visualization of KMeans Clusters')
plt.show()
# Geographic view of the clusters on the original coordinates
plt.scatter(Airbnb['latitude'], Airbnb['longitude'], c=clusters, cmap='viridis', s=50, alpha=0.8)
plt.title('K-means Clusters Based on Latitude and Longitude')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning <ipython-input-47-fdb1565cae29>:58: FutureWarning: ['Room Type', 'neighbourhood_group'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
Cluster 0 1 2 3 \
Latitude min 40.735500 40.506410 40.703720 40.499790
max 40.913060 40.806460 40.790940 40.762150
mean 40.801094 40.713863 40.729629 40.695217
std 0.032203 0.037627 0.023925 0.034103
Longitude min -73.999940 -74.244420 -74.017120 -74.242850
max -73.829320 -73.892100 -73.949100 -73.884260
mean -73.942200 -73.973348 -73.996995 -73.956593
std 0.026907 0.028470 0.014972 0.034047
Price min 10.000000 10.000000 100.000000 10.000000
max 1200.000000 2300.000000 699.000000 800.000000
mean 115.393190 222.900616 273.928444 79.125092
std 80.615230 189.914967 100.975965 47.500997
Number of Reviews min 0.000000 0.000000 0.000000 0.000000
max 143.000000 109.000000 20.000000 100.000000
mean 15.047928 12.729889 2.343470 12.164225
std 22.204045 19.117426 3.775523 18.773961
Host Listings Count min 1.000000 1.000000 232.000000 1.000000
max 103.000000 121.000000 327.000000 103.000000
mean 2.959102 5.396461 287.572451 3.255215
std 8.898915 17.289021 46.850957 8.462379
Cluster 4 5 6
Latitude min 40.538710 40.565460 40.583630
max 40.898110 40.866460 40.886710
mean 40.723633 40.710982 40.736699
std 0.050108 0.050251 0.047313
Longitude min -74.169660 -73.900020 -74.097300
max -73.746270 -73.712990 -73.770690
mean -73.953674 -73.826488 -73.975251
std 0.039081 0.044890 0.040526
Price min 10.000000 10.000000 2350.000000
max 1050.000000 1500.000000 10000.000000
mean 128.295350 95.379493 4435.075949
std 83.072854 90.647730 2221.049843
Number of Reviews min 81.000000 0.000000 0.000000
max 629.000000 212.000000 69.000000
mean 157.777333 24.431642 2.886076
std 66.336899 30.835174 8.522951
Host Listings Count min 1.000000 1.000000 1.000000
max 28.000000 103.000000 12.000000
mean 1.977210 3.521494 2.721519
std 1.894012 8.328940 3.569589
0 1 2 3 4 \
Room Type
Entire home/apt 41.126529 98.451015 98.568873 0.510431 53.711118
Private room 55.030126 0.808412 1.431127 96.279035 44.656606
Shared room 3.843345 0.740573 0.000000 3.210534 1.632276
5 6
Room Type
Entire home/apt 37.279774 82.278481
Private room 58.527132 17.721519
Shared room 4.193094 0.000000
0 1 2 3 4 \
neighbourhood_group
Bronx 4.272412 1.582905 2.146691 1.494304 1.385895
Brooklyn 20.887347 44.287410 20.572451 54.616067 47.859563
Manhattan 60.096768 45.372831 65.116279 33.274153 39.944564
Queens 14.186599 7.908870 11.806798 9.705578 9.947644
Staten Island 0.547745 0.831025 0.357782 0.865513 0.862334
5 6
neighbourhood_group
Bronx 2.924595 0.000000
Brooklyn 31.465821 30.379747
Manhattan 29.668781 55.696203
Queens 35.271318 12.658228
Staten Island 0.634249 1.265823
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from sklearn.metrics import r2_score
# Load the raw NYC listings and summarise dtypes / non-null counts
nyc_data = pd.read_csv('/content/AB_NYC_2019.csv')
nyc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48895 entries, 0 to 48894 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48895 non-null int64 1 name 48879 non-null object 2 host_id 48895 non-null int64 3 host_name 48874 non-null object 4 neighbourhood_group 48895 non-null object 5 neighbourhood 48895 non-null object 6 latitude 48895 non-null float64 7 longitude 48895 non-null float64 8 room_type 48895 non-null object 9 price 48895 non-null int64 10 minimum_nights 48895 non-null int64 11 number_of_reviews 48895 non-null int64 12 last_review 38843 non-null object 13 reviews_per_month 38843 non-null float64 14 calculated_host_listings_count 48895 non-null int64 15 availability_365 48895 non-null int64 dtypes: float64(3), int64(7), object(6) memory usage: 6.0+ MB
# Peek at the first ten listings
nyc_data.head(10)
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
| 5 | 5099 | Large Cozy 1 BR Apartment In Midtown East | 7322 | Chris | Manhattan | Murray Hill | 40.74767 | -73.97500 | Entire home/apt | 200 | 3 | 74 | 2019-06-22 | 0.59 | 1 | 129 |
| 6 | 5121 | BlissArtsSpace! | 7356 | Garon | Brooklyn | Bedford-Stuyvesant | 40.68688 | -73.95596 | Private room | 60 | 45 | 49 | 2017-10-05 | 0.40 | 1 | 0 |
| 7 | 5178 | Large Furnished Room Near B'way | 8967 | Shunichi | Manhattan | Hell's Kitchen | 40.76489 | -73.98493 | Private room | 79 | 2 | 430 | 2019-06-24 | 3.47 | 1 | 220 |
| 8 | 5203 | Cozy Clean Guest Room - Family Apt | 7490 | MaryEllen | Manhattan | Upper West Side | 40.80178 | -73.96723 | Private room | 79 | 2 | 118 | 2017-07-21 | 0.99 | 1 | 0 |
| 9 | 5238 | Cute & Cozy Lower East Side 1 bdrm | 7549 | Ben | Manhattan | Chinatown | 40.71344 | -73.99037 | Entire home/apt | 150 | 1 | 160 | 2019-06-09 | 1.33 | 4 | 188 |
# Count missing values per column (name, host_name, last_review and
# reviews_per_month contain NaNs per the output below)
nyc_data.isnull().sum()
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
Convert the non-numerical fields to categorical codes.
# Encode the three string columns as integer category codes so the
# scikit-learn models below can consume them directly.
for categorical_col in ('neighbourhood_group', 'neighbourhood', 'room_type'):
    nyc_data[categorical_col] = nyc_data[categorical_col].astype("category").cat.codes
nyc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48895 entries, 0 to 48894 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48895 non-null int64 1 name 48879 non-null object 2 host_id 48895 non-null int64 3 host_name 48874 non-null object 4 neighbourhood_group 48895 non-null int8 5 neighbourhood 48895 non-null int16 6 latitude 48895 non-null float64 7 longitude 48895 non-null float64 8 room_type 48895 non-null int8 9 price 48895 non-null int64 10 minimum_nights 48895 non-null int64 11 number_of_reviews 48895 non-null int64 12 last_review 38843 non-null object 13 reviews_per_month 38843 non-null float64 14 calculated_host_listings_count 48895 non-null int64 15 availability_365 48895 non-null int64 dtypes: float64(3), int16(1), int64(7), int8(2), object(3) memory usage: 5.0+ MB
plt.figure(figsize=(10,10))
# `distplot` is deprecated (removed in seaborn 0.14). histplot with a density
# scale plus an explicit fitted-normal overlay reproduces the old
# distplot(..., fit=norm) behaviour.
sns.histplot(nyc_data['price'], stat="density", kde=True)
mu, sigma = norm.fit(nyc_data['price'])
price_grid = np.linspace(nyc_data['price'].min(), nyc_data['price'].max(), 200)
plt.plot(price_grid, norm.pdf(price_grid, mu, sigma), color='k', lw=2)
plt.title("Price Distribution Plot",size=15, weight='bold')
<ipython-input-54-63eaf33967f1>:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
Text(0.5, 1.0, 'Price Distribution Plot')
The above distribution graph shows that there is a right-skewed distribution on price. This means there is a positive skewness. We used Log transformation to make this feature less skewed.
Since the logarithm of zero is undefined, a log(x + 1) transformation is safer.
# log1p(x) == log(x + 1): avoids log(0) for zero-priced listings and is
# numerically more accurate for small values than np.log(x + 1).
nyc_data['price_log'] = np.log1p(nyc_data.price)
plt.figure(figsize=(12,10))
# `distplot` is deprecated (removed in seaborn 0.14); histplot + an explicit
# fitted-normal overlay reproduces distplot(..., fit=norm).
sns.histplot(nyc_data['price_log'], stat="density", kde=True)
mu, sigma = norm.fit(nyc_data['price_log'])
log_price_grid = np.linspace(nyc_data['price_log'].min(), nyc_data['price_log'].max(), 200)
plt.plot(log_price_grid, norm.pdf(log_price_grid, mu, sigma), color='k', lw=2)
plt.title("Log-Price Distribution Plot",size=15, weight='bold')
<ipython-input-56-41f083519966>:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
Text(0.5, 1.0, 'Log-Price Distribution Plot')
The good fit indicates that normality is a reasonable approximation.
# Q-Q plot: compare log-price quantiles against a theoretical normal to
# visually check the normality approximation
plt.figure(figsize=(7,7))
stats.probplot(nyc_data['price_log'], plot=plt)
plt.show()
Non-predictive identifier columns and the old price feature are eliminated.
# Drop identifier / free-text columns and the raw price (superseded by price_log)
dropped_cols = ['name', 'id', 'host_id', 'host_name', 'last_review', 'price']
nyc_model = nyc_data.drop(columns=dropped_cols)
nyc_model.isnull().sum()
neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 minimum_nights 0 number_of_reviews 0 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 price_log 0 dtype: int64
Reviews per month has some missing data. The missing values can be replaced with the mean; since the distribution is fairly symmetric, mean replacement makes sense.
# Mean-impute the missing reviews_per_month values. Chained
# .fillna(..., inplace=True) on a column selection is deprecated under
# pandas copy-on-write and may silently not modify the frame — assign the
# result back to the column instead.
mean = nyc_model['reviews_per_month'].mean()
nyc_model['reviews_per_month'] = nyc_model['reviews_per_month'].fillna(mean)
nyc_model.isnull().sum()
neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 minimum_nights 0 number_of_reviews 0 reviews_per_month 0 calculated_host_listings_count 0 availability_365 0 price_log 0 dtype: int64
A correlation matrix is created with the Pearson method
# Pearson correlation heatmap across all modelling features
plt.figure(figsize=(15,12))
palette = sns.diverging_palette(20, 220, n=256)
corr = nyc_model.corr(method='pearson')
heat = sns.heatmap(
    corr, annot=True, fmt=".2f", cmap=palette, vmax=.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Flip the y-axis so the first feature sits at the top (11 rows)
heat.set(ylim=(11, 0))
plt.title("Correlation Matrix",size=15, weight='bold')
Text(0.5, 1.0, 'Correlation Matrix')
The correlation table shows that there is no strong relationship between price and other features. This indicates no feature needed to be taken out of data.
# Split features (all columns but the last) from the target (price_log)
nyc_model_x, nyc_model_y = nyc_model.iloc[:,:-1], nyc_model.iloc[:,-1]
# Removes the mean and scales each feature/variable to unit variance
scaler = StandardScaler()
nyc_model_x = scaler.fit_transform(nyc_model_x)
Data is split in a 70–30 ratio
# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(nyc_model_x, nyc_model_y, test_size=0.3,random_state=42)
Build a Linear Regression, Ridge Regression, Lasso Regression, and ElasticNet Regression. Comparing these models shows how plain Linear Regression behaves relative to versions with a little regularization.
GridSearchCV algorithm will be used to find the best parameters and tuning hyperparameters for each model. In this algorithm 5-Fold Cross Validation and Mean Squared Error Regression Loss metrics will be used.
Before model building, 5-Fold Cross Validation will be implemented for validation.
# 5-fold CV splitter; shuffle with a fixed seed for reproducibility.
kfold_cv = KFold(n_splits=5, random_state=42, shuffle=True)
# NOTE(review): X_train/X_test/y_train/y_test are overwritten on every
# iteration, so only the LAST fold survives this loop (it also replaces the
# earlier 70/30 train_test_split). If true cross-validation was intended, the
# models should be fitted and scored inside the loop — confirm.
for train_index, test_index in kfold_cv.split(nyc_model_x, nyc_model_y):
    X_train, X_test = nyc_model_x[train_index], nyc_model_x[test_index]
    # Use .iloc: KFold yields positional integer indices; plain [] on a
    # Series is label-based and only works here because the index happens to
    # be a default RangeIndex.
    y_train, y_test = nyc_model_y.iloc[train_index], nyc_model_y.iloc[test_index]
# Keep only the training rows whose target is not NaN (defensive: price_log
# has no missing values in this dataset, so this filter is a no-op here).
valid_indices = y_train.notna().to_numpy()
X_train = X_train[valid_indices]
y_train = y_train[valid_indices]
## Fit the four Phase-1 regressors and collect their test-set predictions.
def _fit_and_predict(estimator):
    # Fit on the training split, then predict the held-out split.
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)

# Linear Regression (unregularised baseline)
lr, lr_pred = _fit_and_predict(LinearRegression(copy_X=True, fit_intercept=True))
# Ridge Model (L2 penalty)
ridge_model, pred_ridge = _fit_and_predict(Ridge(alpha=0.01))
# Lasso Model (L1 penalty)
Lasso_model, pred_Lasso = _fit_and_predict(Lasso(alpha=0.001))
# ElasticNet Model (mixed L1/L2 penalty)
model_enet, pred_test_enet = _fit_and_predict(ElasticNet(alpha=0.01))
# NOTE(review): this section re-fits the same four model configurations on
# the same X_train/y_train as the block above — nothing changes between the
# two runs, so every "_x" model and prediction duplicates the earlier one.
# Presumably a second phase (e.g. after a different split or tuning) was
# intended — confirm before relying on these results.
###Linear Regression
lr_x = LinearRegression(copy_X= True, fit_intercept = True)
lr_x.fit(X_train, y_train)
lr_pred_x= lr_x.predict(X_test)
###Ridge
ridge_x = Ridge(alpha = 0.01)
ridge_x.fit(X_train, y_train)
pred_ridge_x = ridge_x.predict(X_test)
###Lasso
Lasso_x = Lasso(alpha = 0.001)
Lasso_x.fit(X_train, y_train)
pred_Lasso_x = Lasso_x.predict(X_test)
##ElasticNet
model_enet_x = ElasticNet(alpha = 0.01)
model_enet_x.fit(X_train, y_train)
# ElasticNet additionally predicts on the training split (train-set fit check)
pred_train_enet_x= model_enet_x.predict(X_train)
pred_test_enet_x= model_enet_x.predict(X_test)
Metrics to evaluate predictions.
Mean Absolute Error (MAE) shows the average absolute difference between predictions and actual values.
Root Mean Square Error (RMSE) shows how accurately the model predicts the response.
``R^2`` will be calculated to find the goodness of fit measure.
# Report MAE, RMSE and R^2 for each model on the held-out data.
print('-------------Linear Regression-----------')  # header typo "Lineer" fixed
print('MAE: %f'% mean_absolute_error(y_test, lr_pred))
print('RMSE: %f'% np.sqrt(mean_squared_error(y_test, lr_pred)))
print('R2 %f' % r2_score(y_test, lr_pred))
print('---------------Ridge ---------------------')
print('MAE: %f'% mean_absolute_error(y_test, pred_ridge))
print('RMSE: %f'% np.sqrt(mean_squared_error(y_test, pred_ridge)))
print('R2 %f' % r2_score(y_test, pred_ridge))
print('---------------Lasso-----------------------')
print('MAE: %f' % mean_absolute_error(y_test, pred_Lasso))
print('RMSE: %f' % np.sqrt(mean_squared_error(y_test, pred_Lasso)))
print('R2 %f' % r2_score(y_test, pred_Lasso))
print('---------------ElasticNet-------------------')
# (the stray "#RMSE" comments that mislabelled the MAE line were removed)
print('MAE: %f' % mean_absolute_error(y_test,pred_test_enet))
print('RMSE: %f' % np.sqrt(mean_squared_error(y_test,pred_test_enet)))
print('R2 %f' % r2_score(y_test, pred_test_enet))
-------------Lineer Regression----------- MAE: 0.370140 RMSE: 0.523551 R2 0.446063 ---------------Ridge --------------------- MAE: 0.370140 RMSE: 0.523551 R2 0.446063 ---------------Lasso----------------------- MAE: 0.370153 RMSE: 0.523620 R2 0.445918 ---------------ElasticNet------------------- MAE: 0.370481 RMSE: 0.524137 R2 0.444822
# One row of four true-vs-predicted scatter plots, one per Phase-1 model
fig, axes = plt.subplots(1, 4, figsize=(30, 10))
fig.suptitle('True Values vs Predictions')
model_predictions = [
    ('Linear Regression - Phase-1', lr_pred),
    ('Ridge - Phase-1', pred_ridge),
    ('Lasso - Phase-1', pred_Lasso),
    ('ElasticNet - Phase-1', pred_test_enet),
]
for ax, (title, preds) in zip(axes, model_predictions):
    ax.scatter(y_test, preds)
    ax.set_title(title)
    ax.set(xlabel='True Values', ylabel='Predictions')
# Display the plot
plt.show()